##Replication code for "Understanding Political Communication and Political Communicators on Twitch"
##Data sampling for topic modeling
##Sangyeon Kim

#SAMPLING 5% of chat posts from each channel
#1.sample top20 

# Read the per-channel chat data for the top-20 group. The .rda file is
# expected to restore a list named `top20_bychannel`.
load("./raw_data/top20_bychannel.rda")

# Discard list positions 19 through 22. Per the original author's note these
# are channels with only one chat that are not in the channel list, so only
# the first 18 entries are sampled below.
top20_bychannel <- top20_bychannel[-(19:22)]

# Fixed seed so the random draw is reproducible.
set.seed(1542)
n_keep <- 18
samp_top20 <- vector(mode = "list", length = n_keep)

# For each retained channel keep round(nrow / 20) randomly chosen rows
# (roughly 5% of its chat posts). The channel index is printed as progress.
for (ch in seq_len(n_keep)) {
  print(ch)
  chat <- top20_bychannel[[ch]]
  n_chat <- nrow(chat)
  picked <- sample(n_chat, round(n_chat / 20))
  samp_top20[[ch]] <- chat[picked, ]
}

# The full data is no longer needed once the sample has been taken.
rm(top20_bychannel)

#2.sample top21 to top100
# Read the per-channel chat data for the second group (top 21 to top 100 per
# the section heading). The .rda file is expected to restore a list named
# `top20_100_bychannel`.
load("./raw_data/top20_100_bychannel.rda")

# One output slot per channel.
samp_top21_100 <- vector(mode = "list", length = length(top20_100_bychannel))

# Fixed seed so the random draw is reproducible.
set.seed(1542)

# For each channel keep round(nrow / 20) randomly chosen rows (roughly 5%).
# seq_along() is used instead of 1:length(): on an empty list 1:0 would
# iterate over c(1, 0) and fail with a subscript error.
for (i in seq_along(samp_top21_100)) {
  print(i)
  df <- top20_100_bychannel[[i]]
  samp_top21_100[[i]] <- df[sample(nrow(df), round(nrow(df) / 20)), ]
}

#save(samp_top21_100,file="tensamp_top21_100_0608.rda")

#3.sample top101 to top337
# Read the per-channel chat data for the third group (top 101 to top 337 per
# the section heading). The .rda file is expected to restore a list named
# `top101_337_bychannel`.
load("./raw_data/top101_337_bychannel.rda")

# Before sampling, drop every channel that has at most one observation
# (counted through its `username` column), as such channels do not show any
# meaningful pattern. The channels to drop are derived from the data instead
# of being removed through a fixed number of hard-coded deletions, so the step
# works no matter how many such channels the file contains.
has_multiple <- vapply(
  top101_337_bychannel,
  function(channel) length(channel$username) > 1,
  logical(1),
  USE.NAMES = FALSE
)
top101_337_bychannel <- top101_337_bychannel[has_multiple]

# One output slot per remaining channel.
samp_top101_337 <- vector(mode = "list", length = length(top101_337_bychannel))

# Fixed seed so the random draw is reproducible.
set.seed(1542)

# Channels with more than 20 rows are reduced to round(nrow / 20) randomly
# chosen rows (roughly 5%); channels with 20 rows or fewer are kept whole.
# seq_along() avoids the 1:0 trap when no channel remains.
for (i in seq_along(samp_top101_337)) {
  print(i)
  df <- top101_337_bychannel[[i]]
  if (nrow(df) > 20) {
    samp_top101_337[[i]] <- df[sample(nrow(df), round(nrow(df) / 20)), ]
  } else {
    samp_top101_337[[i]] <- df
  }
}

#save(samp_top101_337,file="tensamp_top101_337_0608.rda")

#4.sample top338 to top574

# Read the per-channel chat data for the fourth group (top 338 to top 574 per
# the section heading). The .rda file is expected to restore a list named
# `top338_574_bychannel`.
load("./raw_data/top338_574_bychannel.rda")

# Before sampling, drop every channel that has at most one observation
# (counted through its `username` column), as such channels do not show any
# meaningful pattern. The channels to drop are derived from the data instead
# of being removed through a fixed number of hard-coded deletions, so the step
# works no matter how many such channels the file contains.
has_multiple <- vapply(
  top338_574_bychannel,
  function(channel) length(channel$username) > 1,
  logical(1),
  USE.NAMES = FALSE
)
top338_574_bychannel <- top338_574_bychannel[has_multiple]

# One output slot per remaining channel.
samp_top338_574 <- vector(mode = "list", length = length(top338_574_bychannel))

# Fixed seed so the random draw is reproducible.
set.seed(1542)

# Channels with more than 20 rows are reduced to round(nrow / 20) randomly
# chosen rows (roughly 5%); channels with 20 rows or fewer are kept whole.
# seq_along() avoids the 1:0 trap when no channel remains.
for (i in seq_along(samp_top338_574)) {
  print(i)
  df <- top338_574_bychannel[[i]]
  if (nrow(df) > 20) {
    samp_top338_574[[i]] <- df[sample(nrow(df), round(nrow(df) / 20)), ]
  } else {
    samp_top338_574[[i]] <- df
  }
}

#5. Aggregate all the sampled data

# Stack every sampled channel into one data set, keeping the original order:
# top-20 channels first, then top 21-100, top 101-337 and top 338-574.
#
# All pieces are bound in a single rbind() call. Growing `sampled_all` with
# rbind() inside a loop copies the accumulated rows on every iteration, and a
# `2:length(x)` loop misbehaves when the first list has only one element.
# unname() ensures list names (if any) are not prefixed onto the row names.
sampled_all <- do.call(
  rbind,
  unname(c(samp_top20, samp_top21_100, samp_top101_337, samp_top338_574))
)

# Write the combined sample to an .rda file in the working directory.
save(sampled_all, file = "fig9_to_13.rda")
